---
title: "Bilingual Ballot"
output:
  html_document:
    df_print: paged
---

This is the R script for analyzing Taiwan and Texas bilingual ballot datasets.


```{r load data,message=FALSE}
library(tidyverse)
library(readxl)
library(MASS)     # for polr
library(cregg)
library(fixest)
library(patchwork)
library(stringr)

taiwan <- read_excel("Bilingual Ballots _Selsky.xlsx")
texas <- read_excel("Bilingual Ballots _US Sample.xlsx")
texas <-  texas[-1,]
#data("immigration")
```

# Taiwan data

The TW conjoint experiment has two attributes:

* language: Chinese only; CN and Paiwan (PAI); CN and Vietnamese (VIET); CN-PAI-VIET 
* arrangement: stacked; separated

This design yields 7 profiles: 

 Profile   Lanuage        Design    
--------- --------------- -------------- 
     A     Chinese only    N/A        
     B     CN-PAI          Stacked     
     C     CN-VIET         Stacked      
     D     CN-PAI-VIET     Stacked     
     E     CN-PAI          Separated     
     F     CN-VIET         Separated      
     G     CN-PAI-VIET     Separated 
     

The below preliminary analysis shows that Taiwanese respondants appear to prefer all three languages stacked together. 

```{r create profiles}
# colnames(taiwan)
#Remove rows with NA's using na.omit()
taiwan <- na.omit(taiwan)
# create ballot profiles
ballot_profile <- data.frame (
  profile  = c("A", "B", "C", "D", "E", "F", "G"),
  language = c("CN", "CN-PAI", "CN-VIET","CN-PAI-VIET","CN-PAI", "CN-VIET","CN-PAI-VIET"),
  design = c("stacked","stacked","stacked","stacked","separated","separated","separated"))
```

```{r recode party blocks}
unique(taiwan$party) # check party variable value

# Recode Taiwan party into 3 blocs
taiwan <- taiwan %>%
  mutate(
    party_block = case_when(
      party %in% c("KMT", "NP", "PFP") ~ "Pan-blue",
      party %in% c("DPP", "TSU", "Green Party", "NPP", "SDP", "TSP") ~ "Pan-green",
      party %in% c("TPP", "I do not support any of these party") ~ "Non-partisan", TRUE ~ NA_character_ 
     # Drops "Prefer not to answer" as NA
    ),
    party_block = factor(party_block, levels = c("Pan-blue", "Pan-green", "Non-partisan"))
  )

```

## Fairness
```{r}
tw_fair <- taiwan %>%
  # subset data
  dplyr::select(unique, party_block, fair_yes, fair_no) %>%
  # reshape data
  tidyr::pivot_longer(cols      = c(fair_yes, fair_no),
                      names_to  = "condition", values_to = "measurement") %>% 
  # match choice with profile
  mutate(choice = if_else(condition == "fair_yes", 1L, 0L)) %>%
  inner_join(ballot_profile, by = c("measurement" = "profile")) %>%
  # as factor
  mutate(language = factor(language), design   = factor(design),
         unique   = factor(unique)
  )

# descriptive plotting
fairness <- choice ~ language + design
# estimate Average Marginal Component Effects
amces <- cj(
  data    = tw_fair,
  formula = fairness,
  id      = ~ unique,
  family  = binomial
)
head(amces[c("feature", "level", "estimate", "std.error")], 20L)
plot(amces)
```


## democratic
```{r}
tw_dem <- taiwan %>% # subset data
  dplyr::select(unique, party_block, democratic_yes, democratic_no) %>%
   # reshape data
  tidyr::pivot_longer(cols      = c(democratic_yes, democratic_no),
                      names_to  = "condition", values_to = "measurement") %>% 
  # match choice with profile
  mutate(choice = if_else(condition == "democratic_yes", 1L, 0L) ) %>% 
  inner_join(ballot_profile, by = c("measurement" = "profile")) %>%
  # as factor
  mutate(language = factor(language), design   = factor(design),
         unique   = factor(unique))
# descriptive plotting
democratic <- choice ~ language + design
# estimate Average Marginal Component Effects
amces_d <- cj(tw_dem, democratic, id = ~unique,family = binomial)
head(amces_d[c("feature", "level", "estimate", "std.error")], 20L)
plot(amces_d)
```

## efficiancy
```{r}

tw_effi <- taiwan %>%
  dplyr::select(unique, party_block, efficient_yes, efficient_no) %>%
  tidyr::pivot_longer(cols = c(efficient_yes, efficient_no),
                      names_to  = "condition", 
                      values_to = "measurement") %>%
  mutate(choice = if_else(condition == "efficient_yes", 1L, 0L)) %>%
  inner_join(ballot_profile, by = c("measurement" = "profile")) %>%
  mutate(language = factor(language), 
         design = factor(design), 
         unique = factor(unique))

# descriptive plotting
efficient <- choice ~ language + design
# estimate Average Marginal Component Effects
amces_e <- cj(tw_effi, efficient, id = ~unique,family = binomial)
head(amces_e[c("feature", "level", "estimate", "std.error")], 20L)
plot(amces_e)
```
## subgroup analysis by party

### Fairness
```{r Fairness by party block}
# Pan-blue
tw_fair_blue <- subset(tw_fair, party_block == "Pan-blue")
amces_fair_blue <- cj( data = tw_fair_blue, formula = fairness,
                       id = ~ unique, family  = binomial)
# Pan-green
tw_fair_green <- subset(tw_fair, party_block == "Pan-green")
amces_fair_green <- cj( data = tw_fair_green, formula = fairness,
                        id= ~ unique, family  = binomial)

# Non-partisantw_fair_blue
tw_fair_np <- subset(tw_fair, party_block == "Non-partisan")
amces_fair_np <- cj(data = tw_fair_np, formula = fairness, 
                    id = ~ unique, family  = binomial)


# visualization
p_fair_blue  <- plot(amces_fair_blue)  + ggtitle("Pan-blue")
p_fair_green <- plot(amces_fair_green) + ggtitle("Pan-green")
p_fair_np    <- plot(amces_fair_np)    + ggtitle("Non-partisan")

p_fair <- (p_fair_blue / p_fair_green / p_fair_np) +
  plot_annotation(title = "Fairness AMCEs by Party Bloc")

ggsave("fairness_subgroups.png", p_fair, width = 9, height = 12, dpi = 300)
```

### Democratic
```{r Democratic by party block}

# Pan-blue
tw_dem_blue <- subset(tw_dem, party_block == "Pan-blue")
amces_dem_blue <- cj(data = tw_dem_blue, formula = democratic,
                    id    = ~ unique,    family  = binomial)

# Pan-green
tw_dem_green <- subset(tw_dem, party_block == "Pan-green")
amces_dem_green <- cj(data = tw_dem_green, formula = democratic, 
                      id = ~ unique, family  = binomial)

# Non-partisan
tw_dem_np <- subset(tw_dem, party_block == "Non-partisan")
amces_dem_np <- cj(data = tw_dem_np, formula = democratic, id = ~ unique, 
                   family  = binomial)

## Visualization
p_dem_blue  <- plot(amces_dem_blue)  + ggtitle("Pan-blue")
p_dem_green <- plot(amces_dem_green) + ggtitle("Pan-green")
p_dem_np    <- plot(amces_dem_np)    + ggtitle("Non-partisan")

p_dem <- (p_dem_blue / p_dem_green / p_dem_np) +
  plot_annotation(title = "Democratic AMCEs by Party Bloc")

ggsave("democratic_subgroups.png", p_dem,width = 9, height = 12, dpi = 300)

```

### Efficiancy
```{r Efficiency by party block}

# Pan-blue
tw_effi_blue <- subset(tw_effi, party_block == "Pan-blue")
amces_effi_blue <- cj(data = tw_effi_blue, formula = efficient,
                      id = ~ unique, family  = binomial)

# Pan-green
tw_effi_green <- subset(tw_effi, party_block == "Pan-green")
amces_effi_green <- cj(data  = tw_effi_green, formula = efficient,
                       id = ~ unique, family  = binomial)

# Non-partisan
tw_effi_np <- subset(tw_effi, party_block == "Non-partisan")
amces_effi_np <- cj( data = tw_effi_np, formula = efficient,
                     id = ~ unique, family  = binomial)

## Visualization
p_effi_blue  <- plot(amces_effi_blue)  + ggtitle("Pan-blue")
p_effi_green <- plot(amces_effi_green) + ggtitle("Pan-green")
p_effi_np    <- plot(amces_effi_np)    + ggtitle("Non-partisan")

p_effi <- (p_effi_blue / p_effi_green / p_effi_np) +
  plot_annotation(title = "Efficiency AMCEs by Party Bloc")

ggsave("efficiency_subgroups.png",
       p_effi, width = 9, height = 12,dpi = 300)

```


# TX data

The TX conjoint experiment design has two attributes:

* language: monolingual (EN only); bilingual (EN and ES)
* arrangement: stacked; separated

This design yields three possible profiles:

* Ballot A: English only monolingual
* Ballot B: English-Spanish bilingual stacked together
* Ballot C: English-Spanish bilingual separated into columns

## Attitudes toward bilingual ballot 
```{r}
# remove those who choose the same design in both Q2 and Q4
texas <- texas[texas$Q4 != texas$Q2, ] 
#Remove rows with NA's using na.omit()
texas <- texas |> drop_na(Q2, Q4)
# rank preference toward bilingual ballots
texas <- texas  %>%  mutate(
  Q4_1 = if_else(Q4 == "Ballot A: English only monolingual", "A", "B/C")
  )
texas <- texas  %>%  mutate(Q2_1 = ifelse(
    texas$Q2 == "Ballot A: English only monolingual", 
    yes = "A", no = "B/C"))
texas <- texas  %>%  mutate(attitudes_bilingual = case_when(
    texas$Q2_1== "A" & texas$Q4_1== "B/C" ~ -1,
    texas$Q2_1== "B/C" & texas$Q4_1== "A" ~ 1,
    texas$Q2_1== "B/C" & texas$Q4_1== "B/C" ~ 0))
hist(texas$attitudes_bilingual)
```

## Attitudes toward stacked/separated designs

For respondents who are OK with including Spanish in the ballot, they seem to prefer having them in separated columns. I think it makes sense because Spanish and English both uses Latin alphabets, and it is hard to read when they are mixed together. However, it might not be a problem for languages like Chinese.
```{r}
# rank preferences on stacked/separated design
knitr::kable(round(prop.table(table(texas$Q2,texas$attitudes_bilingual)), 3))
barplot(prop.table(table(texas$Q2,texas$attitudes_bilingual)),
        legend.text = TRUE,
        args.legend = list(x = "topleft"))
```

## relationship with other controls 

```{r}
# code demographics
# age
texas$age <- 2022 - as.numeric(texas$Q7_1)
# gender
texas <- texas  %>%  mutate(male = ifelse(
    texas$Q8 == "Male", yes = 1, no = 0))
# ideology
texas <- texas  %>%  mutate(ideology = case_when(
    texas$Q11 == "Strongly liberal"~1, texas$Q11 == "Libral"~2,
    texas$Q11 == "Somewhat liberal"~3, texas$Q11 == "Moderate"~4,
    texas$Q11 == "Somewhat conservative"~5, texas$Q11 == "Conservative"~6,
    texas$Q11 == "Strongly conservative"~7))
# spanish proficiency
texas <- texas  %>%  mutate(spanish_prof = case_when(
    texas$Q16 == "I can speak Spanish fluently"~3, 
    texas$Q16 == "I can speak it well enough to get by in daily life"~2,
    texas$Q16 == "Very little"~1, 
    texas$Q16 == "Not at all"~0))
# English only household
texas <- texas  %>%  mutate(speak_English_only = ifelse(
    texas$Q14 == "English", yes = 1, no = 0))
# partisanship (simple)
# table(texas$Q9,texas$Q10 )
texas <- texas  %>%  mutate(partisanship = case_when(
    texas$Q9 == "Democrat"~2, texas$Q9 == "Independent"~4,
    texas$Q9 == "Other"~4, texas$Q9 == "Republican"~6))
#    
```

```{r}
# code ordinal outcomes
texas$attitudes_bilingual <- ordered(
  texas$attitudes_bilingual,
  levels = c(-1, 0, 1)
)
# run models
txm1 <- polr(attitudes_bilingual ~ speak_English_only,
             data = texas, method = "logistic", Hess = TRUE)

txm2 <- polr(attitudes_bilingual ~ spanish_prof,
             data = texas,  method = "logistic", Hess = TRUE)

txm3 <- polr(attitudes_bilingual ~ ideology,
             data = texas,  method = "logistic", Hess = TRUE)

txm4 <- polr(attitudes_bilingual ~ age + male,
             data = texas,  method = "logistic", Hess = TRUE)

txm5 <- polr(attitudes_bilingual ~ partisanship,
             data = texas,  method = "logistic", Hess = TRUE)
# preview some results
results <- list( speak_English = txm1$coefficients,
                 spanish_prof = txm2$coefficients, 
                 ideology = txm3$coefficients,
                 age_gender = txm4$coefficients,
                 partisanship = txm5$coefficients
)

knitr::kable(unlist(results))

```

## Robustness checkes
```{r}
survey_year <- 2022

texas2 <- texas %>%
  # keep only complete ballot responses
  drop_na(Q2, Q4) %>%
  mutate(
    Q2_ballot = case_when(
      Q2 == "Ballot A: English only monolingual" ~ "A",
      Q2 == "Ballot B: English-Spanish bilingual stacked together" ~ "B",
      Q2 == "Ballot C: English-Spanish bilingual separated into columns" ~ "C",
      TRUE ~ NA_character_
    ),
    Q4_ballot = case_when(
      Q4 == "Ballot A: English only monolingual" ~ "A",
      Q4 == "Ballot B: English-Spanish bilingual stacked together" ~ "B",
      Q4 == "Ballot C: English-Spanish bilingual separated into columns" ~ "C",
      TRUE ~ NA_character_
    )
  ) %>%
  drop_na(Q2_ballot, Q4_ballot) %>%
  # drop those who picked same ballot as most & least
  filter(Q4_ballot != Q2_ballot) %>%
  mutate(
    Q2_is_A = if_else(Q2_ballot == "A", 1L, 0L),
    Q4_is_A = if_else(Q4_ballot == "A", 1L, 0L),

    # original 3-category outcome: -1 (anti-bilingual), 0 (both bilingual), 1 (pro-bilingual)
    attitudes_bilingual = case_when(
      Q2_is_A == 1L & Q4_is_A == 0L ~ -1,
      Q2_is_A == 0L & Q4_is_A == 1L ~  1,
      Q2_is_A == 0L & Q4_is_A == 0L ~  0,
      TRUE ~ NA_real_
    ),

    # Outcome 1: favorite ballot is bilingual (B/C)
    fav_bilingual = if_else(Q2_ballot == "A", 0L, 1L),

    # Outcome 2: among bilingual favorites only, prefer separated (C) over stacked (B)
    fav_arrangement = case_when(
      Q2_ballot == "B" ~ "stacked",
      Q2_ballot == "C" ~ "separated",
      TRUE ~ NA_character_
    ),
    fav_separated = case_when(
      Q2_ballot == "B" ~ 0L,
      Q2_ballot == "C" ~ 1L,
      TRUE ~ NA_integer_
    ),

    # demographics / predictors
    age  = survey_year - suppressWarnings(as.numeric(Q7_1)),
    male = case_when(
      Q8 == "Male" ~ 1L,
      Q8 == "Female" ~ 0L,
      TRUE ~ NA_integer_
    ),

    ideology = case_when(
      Q11 == "Strongly liberal" ~ 1,
      Q11 == "Liberal" ~ 2,
      Q11 == "Somewhat liberal" ~ 3,
      Q11 == "Moderate" ~ 4,
      Q11 == "Somewhat conservative" ~ 5,
      Q11 == "Conservative" ~ 6,
      Q11 == "Strongly conservative" ~ 7,
      TRUE ~ NA_real_
    ),

    spanish_prof = case_when(
      Q16 == "I can speak Spanish fluently" ~ 3,
      Q16 == "I can speak it well enough to get by in daily life" ~ 2,
      Q16 == "Very little" ~ 1,
      Q16 == "Not at all" ~ 0,
      TRUE ~ NA_real_
    ),

    spanish_prof_f = factor(
      spanish_prof,
      levels = c(0, 1, 2, 3),
      labels = c("None", "Little", "Functional", "Fluent")
    ),

    # Home language (handle multi-select robustly)
    Q14 = as.character(Q14),
    has_english = str_detect(Q14, "English"),
    has_spanish = str_detect(Q14, "Spanish"),
    has_other   = str_detect(Q14, "Language\\(s\\) other than English or Spanish"),

    speak_English_only = if_else(has_english & !has_spanish & !has_other, 1L, 0L),
    english_only_f = factor(
      speak_English_only,
      levels = c(0, 1),
      labels = c("Not English-only", "English-only")
    ),

    household_lang = case_when(
      has_english & !has_spanish & !has_other ~ "English only",
      has_english &  has_spanish              ~ "English + Spanish",
      !has_english & has_spanish & !has_other ~ "Spanish only",
      has_other & !has_english & !has_spanish ~ "Other only",
      TRUE                                    ~ "Mixed/Other"
    ),
    household_lang = factor(
      household_lang,
      levels = c("English only", "English + Spanish", "Spanish only", "Other only", "Mixed/Other")
    ),

    # party ID
    partisanship = case_when(
      Q9 == "Democrat" ~ "Democrat",
      Q9 == "Republican" ~ "Republican",
      Q9 == "Independent" ~ "Independent",
      Q9 == "Other" ~ "Other",
      TRUE ~ NA_character_
    ),
    partisanship = factor(partisanship, levels = c("Democrat", "Independent", "Republican", "Other")),

    # FE
    Q6 = factor(Q6),

    # race from multi-select strings in Q13
    Q13 = as.character(Q13),
    race5 = case_when(
      str_detect(Q13, "White") &
        !str_detect(Q13, "Black|Asian|American Indian|Alaska Native|Middle Eastern|Native Hawaiian|Pacific Islander") ~ "White",
      str_detect(Q13, "Black or African American") &
        !str_detect(Q13, "White|Asian|American Indian|Alaska Native|Middle Eastern|Native Hawaiian|Pacific Islander") ~ "Black",
      str_detect(Q13, "Asian") &
        !str_detect(Q13, "White|Black|American Indian|Alaska Native|Middle Eastern|Native Hawaiian|Pacific Islander") ~ "Asian",
      str_detect(Q13, "Middle Eastern") &
        !str_detect(Q13, "White|Black|Asian|American Indian|Alaska Native|Native Hawaiian|Pacific Islander") ~ "Middle Eastern",
      TRUE ~ "Multiracial/Other"
    ),
    race5 = factor(race5, levels = c("White", "Black", "Asian", "Middle Eastern", "Multiracial/Other"))
  )
```

### Check A: Does bilingual preference vary by Spanish proficiency / home language?

```{r Check A}
m_fav1 <- feglm(
  fav_bilingual ~ spanish_prof_f + household_lang +
    partisanship + ideology + race5 + age + male | Q6,
  family = "binomial",
  data = texas2
)

m_fav2 <- feglm(
  fav_bilingual ~ spanish_prof_f * partisanship +
    household_lang + ideology + race5 + age + male | Q6,
  family = "binomial",
  data = texas2
)

# cluster by university
summary(m_fav1, vcov = ~Q6)
summary(m_fav2, vcov = ~Q6)

# coeftables (clean)
summary(m_fav1, vcov = ~Q6)$coeftable
summary(m_fav2, vcov = ~Q6)$coeftable
```

```{r}
# plot
get_mode <- function(x) {
  x <- x[!is.na(x)]
  if (is.factor(x)) x <- droplevels(x)
  x <- as.character(x)
  x <- x[x != ""]
  if (length(x) == 0) return(NA_character_)
  names(which.max(table(x)))
}

# pediction grid for Spanish proficiency × partisanship 
newdat_party <- tidyr::expand_grid(
  spanish_prof_f = factor(levels(texas2$spanish_prof_f), levels = levels(texas2$spanish_prof_f)),
  partisanship   = factor(levels(texas2$partisanship),   levels = levels(texas2$partisanship))
) %>%
  mutate(
    household_lang = factor("English only", levels = levels(texas2$household_lang)),
    race5          = factor(get_mode(texas2$race5), levels = levels(texas2$race5)),
    male           = 0L,
    ideology       = median(texas2$ideology, na.rm = TRUE),
    age            = median(texas2$age, na.rm = TRUE)
  )


m_fav2_plot <- glm(
  fav_bilingual ~ spanish_prof_f * partisanship +
    household_lang + ideology + race5 + age + male,
  family = binomial(),
  data = texas2
)

newdat_party <- newdat_party %>%
  mutate(pred = predict(m_fav2_plot, newdata = ., type = "response"))

# Plot 
ggplot(newdat_party, aes(x = spanish_prof_f, y = pred, group = 1)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  facet_wrap(~ partisanship) +
  coord_cartesian(ylim = c(0.5, 1)) +
  labs(
    x = "Spanish proficiency",
    y = "Predicted Pr(Favorite ballot is bilingual)",
    title = "Bilingual ballot preference by Spanish proficiency",
    subtitle = "Panels by partisanship"
  ) +
  theme_classic(base_size = 12) +
  theme(axis.text.x = element_text(angle = 20, hjust = 1))
```

### check B: among bilingual favorites (B/C), separated (C) vs stacked (B)
```{r}
texas_bi <- texas2 %>%
  filter(Q2_ballot %in% c("B", "C")) %>%
  drop_na(fav_separated)

m_arr1 <- feglm(
  fav_separated ~ spanish_prof_f + household_lang + partisanship + ideology + race5 + age + male | Q6,
  family = "binomial",
  data = texas_bi
)

m_arr2 <- feglm(
  fav_separated ~ spanish_prof_f * partisanship + household_lang + ideology + race5 + age + male | Q6,
  family = "binomial",
  data = texas_bi
)

summary(m_arr1, vcov = ~Q6)
summary(m_arr2, vcov = ~Q6)

summary(m_arr1, vcov = ~Q6)$coeftable
summary(m_arr2, vcov = ~Q6)$coeftable
```

```{r}
prop_data <- texas_bi %>%
  drop_na(spanish_prof_f) %>%
  group_by(spanish_prof_f) %>%
  summarise(
    prop_separated = mean(fav_separated == 1L, na.rm = TRUE),
    n = n(),
    .groups = "drop"
  )

ggplot(prop_data, aes(x = spanish_prof_f, y = prop_separated)) +
  geom_line(group = 1) +
  geom_point(size = 3) +
  coord_cartesian(ylim = c(0, 1)) +
  labs(
    x = "Spanish proficiency",
    y = "Observed proportion preferring separated ballot (C)",
    title = "Observed layout preference among bilingual supporters",
    subtitle = "Restricted to respondents whose favorite ballot is bilingual (B or C)"
  ) +
  theme_classic(base_size = 12)
```

